# Load the training file into `labeled` and the test file into `unlabeled`.
# read.csv() already defaults to a header row and a comma separator, so
# neither option needs to be spelled out.
labeled <- read.csv(file = "data/train.csv")
unlabeled <- read.csv(file = "data/test.csv")
# Histograms of columns 2..13 of the labeled data on a 3 x 4 grid
# (12 panels, one per column). Each panel uses the column name as both
# its title and its x-axis label.
par(mfrow = c(3, 4))
labeled_names <- names(labeled)
for (col_idx in 2:13) {
    col_label <- labeled_names[col_idx]
    hist(labeled[, col_idx], main = col_label, xlab = col_label)
}

# Histograms of columns 2..12 of the unlabeled data on a 3 x 4 grid
# (11 panels are drawn, so the last grid cell stays empty). Each panel uses
# the column name as both its title and its x-axis label.
par(mfrow = c(3, 4))
unlabeled_names <- names(unlabeled)
for (col_idx in 2:12) {
    col_label <- unlabeled_names[col_idx]
    hist(unlabeled[, col_idx], main = col_label, xlab = col_label)
}

# Scatter-plot matrix of columns 2..13 of the labeled data.
# `pch` selects a plotting symbol (an integer code or a single character);
# it is not a size. A fractional value such as 0.1 is silently truncated to
# the integer code 0 (open square), so the symbol is written as 0 here to
# state what is actually drawn. Marker size is controlled by `cex` alone.
pairs(
    labeled[, 2:13],
    main = "Scatter plot of each variable in labeled dataset",
    pch = 0,
    cex = 0.05
)

library(corrplot)
# Console message printed when the package is attached: corrplot 0.92 loaded
# Switch the device back to a single panel for the correlation plot.
par(mfrow = c(1, 1))
# Correlation matrix of columns 2..13 of the labeled data.
cor_mat <- cor(labeled[, 2:13])
# Palette generator interpolating from white to red; sampled at 100 steps below.
white_to_red <- colorRampPalette(c("white", "red"))
# Upper triangle only, variables ordered by hierarchical clustering,
# coefficients overlaid in black, text labels in black rotated 45 degrees.
corrplot(
    cor_mat,
    method = "color",
    type = "upper",
    order = "hclust",
    col = white_to_red(100),
    addCoef.col = "black",
    tl.col = "black",
    tl.srt = 45
)

# Box plots of columns 2..13 of the labeled data on a 3 x 4 grid
# (12 panels, one per column). Each panel uses the column name as both
# its title and its x-axis label.
par(mfrow = c(3, 4))
labeled_names <- names(labeled)
for (col_idx in 2:13) {
    col_label <- labeled_names[col_idx]
    boxplot(labeled[, col_idx], main = col_label, xlab = col_label)
}